First we need to import Packages required for the Analysis and register multiple Cores for parallel computation.
library(scRNAseq.analysis)
library(Seurat)
library("tidyr")
library(cowplot)
library(ggplot2)
library(org.Mm.eg.db)
library(clustree)
library(dplyr)
library(plyr)
library(foreach)
library(clusterProfiler)
library(biomaRt)
library(DOSE)
library(grid)
library(gridExtra)
doMC::registerDoMC(future::availableCores()*0.5)
###only half, as not to use up all resources, change accordingly, if more power required
# system.time(fastSave::load.lbzip2(here::here("RImages/SeqWell_myeolid/Exp_191114","full_analysed.RDataFS"),n.cores=100))
###custom color palette for Seurat, number stands for cluster ID without annotation
my_cols <- c('0'='#006A40FF','1'="#F8E5AE",'2'='#75B41EFF','3'='#95828DFF','4'='#708C98FF',
'5'='#8AB8CFFF','6'='#007E7FFF','7'='#358359FF','8'='#8BA1BCFF','9'='#5A5895FF',
'10'='#F2990CFF','11'='#5A5895FF','12'='#E5BA3AFF',"13"="chartreuse","14"='#F08892FF',"15"="#E10520","16"="#13EF48","17"="#E08A9D","18"="#E59E3C")
my_cols2 <- my_cols[order(as.integer(names(my_cols)))]
### show the ID of the color Then we need to load the Mart datasets to be able to retrieve more information about our genes.
###execute one and save the object
mouse_ensembl<-biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")
human_ensembl<-biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")Then we import the matrices from a DropSeq pipeline output folder:
#This path is mounted in Singularity ... to reproduce the results, you can access the data using the following mount in Singularity: --bind /groups/NovaSeq-01/bioinformatics/SeqWell/84_Seq_Beyer_spg15:/data/SeqWell/190613
dir_RNA <- "/data/SeqWell/190613/alignment/2020-07-17/output/results/samples/"
###Lets create a list for Seurat objects for better overview
Seurat_objects<-list()
Seurat_objects$total<-read_mtx(dir=dir_RNA)
#>
#> Attaching package: 'Matrix'
#> The following object is masked from 'package:S4Vectors':
#>
#> expand
#> The following objects are masked from 'package:tidyr':
#>
#> expand, pack, unpackOr import SmartSeq2 Data using import_SS2_kallisto():
##use ensembl for gene annotation (standard output is in ensembleID, thus we use biomart to convert to human readable gene symbols)
mouse_ensembl<-biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")
###same as above: --bind /groups/NovaSeq-01/bioinformatics/RNA/124_2019_SS2_Beyer_CNS_TCells_210414:/data/SS2/190613
Seurat_objects$SS2_190613<-import_SS2_kallisto("/data/SS2/190613/alignment/2021-04-15--17-09-37/output/kallisto",mouse_ensembl)The Metadata in our case is directly contained in the Cell name, thus we can just extract it:
meta<-separate(as.data.frame(colnames(Seurat_objects$total)),col = "colnames(Seurat_objects$total)",sep = "_",into=c(NA,"Genotype","Mouse",NA,"Pool"),remove = F)
rownames(meta)<-meta$`colnames(Seurat_objects$total)`
meta<-meta[,-1]
Seurat_objects$total<-AddMetaData(Seurat_objects$total,meta)We can furthermore add the Percentage of mitochondrial genes:
###lets first find genes, which are located on the mitochondrial chromosome:
mito.genes<-biomaRt::getBM(attributes=c('external_gene_name', 'transcript_biotype',"chromosome_name"),
filters = 'chromosome_name',
values = "MT",
mart = mouse_ensembl)
##you need to subset first the genes which are actually present in the seurat object
mito.genes<-mito.genes$external_gene_name[mito.genes$external_gene_name%in%rownames(Seurat_objects$total)]
##add percentage of this genes
Seurat_objects$total<-MetaFeature(Seurat_objects$total,features = as.character(mito.genes),meta.name = "percent.mito")First check the alignment quality:
##multiqc statistics
path_191114<-"/data/SeqWell/191114/alignment/2020-03-13/output/results"
align_stats(path_191114,subset="ADT",exclude=T,legend.pos = "right",ymax=150000000,title="191114_v2")
#> Using ID as id variables
#> Scale for 'y' is already present. Adding another scale for 'y', which will
#> replace the existing scale.Here you can see that all the subsamples from our dataset show similar alignment quality. Around 50-70% of the reads were aligned and most of the reads survived filtering, which is an okay quality. The sequencing depth varied slightly, on average a number of reads in the range of 5.0e+07 was observed.
We can also check the composition of our transcripts:
### To find the biotypes of transcripts a Biomart object is required. Lets use the one from above
genetypes_RNA<-gene_types(Seurat_objects$total,mart.use = mouse_ensembl)
#> Joining, by = "SYMBOL"
###draw_plot is also a function of cowplot, thus use ::
genestype_plot<-scRNAseq.analysis::draw_plot(ggplot(genetypes_RNA,aes(x=TYPE,y=SUM)),xlab="",ylab="Sum over all cells",plot=geom_jitter(height = 0, width = 0.1),legend="none")+scale_y_log10()+coord_flip()
genestype_plotYou can see that the majority of transcripts are protein coding genes. Other Transcript types may not be very informative, thus we leave them out for clustering and cluster annotation. Furthermore we’ll remove ribosomal genes, which are also less informative but highly expressed:
###lets find all genes annotated as rRNA in biomart
ribo.genes<-biomaRt::getBM(attributes=c('external_gene_name', 'transcript_biotype',"chromosome_name"),
filters = 'biotype',
values = "rRNA",
mart = mouse_ensembl)
##not all of them are annotated, thus we select based on known terms
ribo.genes <- c(grep(pattern =c("^Rpl|^Rps|^Mrps|^Mrpl"), x = rownames(Seurat_objects$total),
value = TRUE),ribo.genes$external_gene_name)
#keep protein coding only
Seurat_objects$filtered<-subset(Seurat_objects$total,features=genetypes_RNA[genetypes_RNA$TYPE=="protein_coding",]$SYMBOL)
#> Warning in subset.Assay(x = x[[assay]], cells = cells, features =
#> assay.features): NAs passed in the features vector, removing NAs
###remove ribosomal genes
filt_genes<-rownames(Seurat_objects$filtered)
filt_genes<-filt_genes[which(!filt_genes%in%ribo.genes)]
Seurat_objects$filtered<-subset(Seurat_objects$total,features=filt_genes)
Now lets include low quality cells. We start with removing empty wells:
###save our data in extra object, so we do not loose the previous state, when trying different subsetting criteria.
dataset<-Seurat_objects$filtered
#UMIs per cell####
UMIs_cell(seurat_total = dataset,cutoff = 40)
#> Warning: Transformation introduced infinite values in continuous y-axis
dataset<-subset(dataset,subset = nCount_RNA>40)These are all cells, in which the cell size (rank here) is not proportional to the #UMIs (all cells below red dotted line).
Another posibility to look at the quality of cells is the following plot:
dataset<-subset(dataset,subset=nFeature_RNA>370&nCount_RNA>500)
umis_genes_color(dataset)###standard color is finding column in meta.data called "percent.mito" ... adjust accordingly
###if you are done save the subsetted dataset in our list
Seurat_objects$postQC<-datasetYou should try to remove the tails in the histogram, so that we keep only high quality cells. Here we subsetted for at least 370 genes and 500 Umis. No cells with high percent mitochondrial were observed in the lower left quadrant and the mito content is generally low. Thus we do not need to subset for percent.mito.
We still have to exclude doublets and correct for ambient gene expression. But for this steps a first clustering result should be available. Thus we first start with dimensionality reduction and clustering. The scRNAseq analysis is split into 3 Parts:
Here we use a wrapper function for all these parts called seurat_flow().
###with Seurat >4.0 you need to make sure that Matrix <1.3.2 is installed otherwise following error,when executing FindNeighbors():
#Error in validObject(.Object) :
#invalid class “Graph” object: superclass "Mnumeric" not defined in the environment of the object's class
### use the following commands:
# remotes::install_version("Matrix", version = "1.3.2", repos = "http://cran.us.r-project.org")
###umap-learn need to be installed beforehand, if that doesn't work just remove ,umap="uwot"
Seurat_objects$postQC<-seurat_flow(Seurat_objects$postQC,dim=1:21,sct=T,norm=F,umap="uwot") To determine the number of dimensions to use, check the elbow in ElbowPlot().
We have performed clustering using the standard resolution of 0.7. Let’s see if we can make a more educated guess on the number of cluster available in our dataset.
To that purpose, two packages can be made use of. The first one is called clustree and it just shows us a graph like comparison between the different clustering resolutions. The resolution is selected at the point, when the clusters (nodes) start to mix up between resolutions (mixed edges).
###First remove previous clustering runs
Seurat_objects$postQC@meta.data<-Seurat_objects$postQC[[]][,-grep("SCT_",names(Seurat_objects$postQC[[]]),fixed = T)]
###perform clustering at resolutions between 0.1 and two in 0.2 steps for example for both Genotypes
for(i in seq(0,1,0.1)){
Seurat_objects$postQC <- FindClusters(Seurat_objects$postQC, resolution = i)
}
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 1.0000
#> Number of communities: 1
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.9385
#> Number of communities: 5
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.9055
#> Number of communities: 7
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8843
#> Number of communities: 7
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8656
#> Number of communities: 8
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8493
#> Number of communities: 9
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8347
#> Number of communities: 11
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8210
#> Number of communities: 11
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8081
#> Number of communities: 12
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7975
#> Number of communities: 14
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5378
#> Number of edges: 218128
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7865
#> Number of communities: 15
#> Elapsed time: 0 seconds
#Clustree is used to select the proper resolution. In the plot, in.props means the number of
# cells in the edge / number of cells in node where edge is pointing to. If Clusters with a lower
# resolution are sharing cell in a higher resolution, this resolution is not adequate. You need to set the prefix for the clusters at
# different resolutions in the metadata, the general naming is the following: <Assay>_<Clustering_method>_res.
###check the prefix to use in the metadata colnames
###execute clustree
clustree(Seurat_objects$postQC, prefix = "SCT_snn_res.")
#> Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
#> Please use the `.add` argument instead.DimPlot(Seurat_objects$postQC)###if only empty wells excluded
Idents(Seurat_objects$postQC)<-Seurat_objects$postQC[["SCT_snn_res.0.4"]]You can see that there are singular mixes in low resolution and from 0.7 there appear to be slightly less stability. Since we still need to do some QC, lets not go into much detail and take for now resolution 0.4, where the clusters were stable for two resolutions. For doublet and ambient gene correction, we also need to check if the cluster roughly make sense. So let’s just quickly check the umap and marker genes.
library(dplyr)
dataset.markers<-list()
dataset.markers$postQC<-FindAllMarkers(Seurat_objects$postQC)
#> Calculating cluster 0
#> Calculating cluster 1
#> Calculating cluster 2
#> Calculating cluster 3
#> Calculating cluster 4
#> Calculating cluster 5
#> Calculating cluster 6
#> Calculating cluster 7
top_genes<-list()
###functions used in different packages as well, thus specify dplyr
top_genes$postQC<- dataset.markers$postQC %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 5,
wt = avg_log2FC)
plot_grid(DoHeatmap(Seurat_objects$postQC, features = top_genes$postQC$gene)+NoLegend(),DimPlot(Seurat_objects$postQC,label=T)+NoLegend()+NoAxes())
#> Warning in DoHeatmap(Seurat_objects$postQC, features = top_genes$postQC$gene):
#> The following features were omitted as they were not found in the scale.data
#> slot for the SCT assay: Rack1, Csf1r You can see, that the cluster are not mixed up and the marker genes make sense.
Ambient genes are especially abundant in droplet or pico-well Sequencing technologies. Thus, it is advisable to perform correction for ambient genes for this dataset (SeqWell, pico-well).
To that purpose we can use the tool SoupX (constantAmateur/SoupX). Here a wrapper function is used … which extracts the count data from our unfiltered raw dataset and the QC post dataset, with preliminary clusters. The clustering beforehand is required for automatic SoupX correction.
#correct for ambient genes
par(mfrow = c(1,2), xpd=TRUE)
Seurat_objects$nosoup<-remove_ambient(Seurat_objects$postQC,Seurat_objects$filtered)
#> 182 genes passed tf-idf cut-off and 38 soup quantile filter. Taking the top 38.
#> Using 80 independent estimates of rho.
#> Estimated global rho of 0.03
#> Warning in sparseMatrix(i = out@i[w] + 1, j = out@j[w] + 1, x = out@x[w], :
#> 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
#> Expanding counts from 8 clusters to 5378 cells.
mito.genes<-mito.genes[mito.genes%in%rownames(Seurat_objects$nosoup)]
Seurat_objects$nosoup<-MetaFeature(Seurat_objects$nosoup,mito.genes,"percent.mito")
plot_grid(umis_genes_color(Seurat_objects$postQC),umis_genes_color(Seurat_objects$nosoup), labels = c("Pre-correction","Post-correction"))From this plot we can see, that the maximal estimated contamination fraction is 3%, which lays in the range of expected contamination in SeqWell experiments.
Seurat_objects$nosoup<-seurat_flow(Seurat_objects$nosoup,res=0.4,dim=1:21,umap="uwot")
#compare ambient gene corrected dataset and unGenotcorrected
##re-add percent.mito
plot_grid(umis_genes_color(Seurat_objects$postQC),umis_genes_color(Seurat_objects$nosoup), labels = c("Pre-correction","Post-correction"))You can already see a huge difference in the QC metrics. It seems that cells with very high number of genes and UMIs mainly contained information about ambient genes. The correction, made also the cells more comparable, bringing them into a similar range. Let’s also check the clustering. Here the UMAPs:
dataset.markers$nosoup<-FindAllMarkers(Seurat_objects$nosoup, only.pos = TRUE)
#> Calculating cluster 0
#> Calculating cluster 1
#> Calculating cluster 2
#> Calculating cluster 3
#> Calculating cluster 4
#> Calculating cluster 5
#> Calculating cluster 6
#> Calculating cluster 7
#> Calculating cluster 8
top_genes$nosoup<- dataset.markers$nosoup %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 5,
wt = avg_log2FC)
plot_grid(DimPlot(Seurat_objects$postQC,label=T)+NoLegend()+NoAxes(),DimPlot(Seurat_objects$nosoup,label=T)+NoLegend()+NoAxes(), labels = c("Pre-correction","Post-correction"))You can see that we find more clusters with the same resolution and the clusters found before separate also better after correction. Lets check The marker genes as well:
plot_grid(DoHeatmap(Seurat_objects$postQC, features = top_genes$postQC$gene)+NoLegend(),DoHeatmap(Seurat_objects$nosoup, features = top_genes$nosoup$gene)+NoLegend(), labels = c("Pre-correction","Post-correction"))
#> Warning in DoHeatmap(Seurat_objects$postQC, features = top_genes$postQC$gene):
#> The following features were omitted as they were not found in the scale.data
#> slot for the SCT assay: Rack1, Csf1r
#> Warning in DoHeatmap(Seurat_objects$nosoup, features = top_genes$nosoup$gene):
#> The following features were omitted as they were not found in the scale.data
#> slot for the SCT assay: Rack1, Csf1r, Cx3cr1The marker genes are similar, although after correction Nfkbia and Mrc1 appear, which tell us more about the state (Nfkb=Inflammation) and identity (Mrc1 = Macrophage) of certain populations.
It is recommended to perform doublet detection separately on each of the present conditions. In our case, we perform the detection on the Genotypes WT and KO. Prior to run the wrapper function for DoubletFinder, the QC for the separate Genotypes might be adjusted slightly and uninformative clusters removed. I will not go here into detail, because the major steps were already explained above:
Seurat_objects[c("WT","KO")]<-NULL
Genotypes<-SplitObject(Seurat_objects$nosoup,split.by = "Genotype")
Seurat_objects<-c(Seurat_objects,Genotypes)
###We needed Genotypes only as a helper variable, so lets remove it again
rm(Genotypes)
##rerrun seurat for soup corrected genotypes
###do not use umap-learn when performing seurat_flow in parallel, otherwise error:
# Terminating: fork() called from a process already using GNU OpenMP, this is unsafe.
Seurat_objects[c("WT","KO")]<- llply(.data = Seurat_objects[c("WT","KO")], .fun =seurat_flow,res=.6,dim=1:21,sct=T,alg=1,.parallel = T,.paropts = list(.packages = "Seurat"))
###analyse WT first
for(i in seq(0.1,1.0,0.1)){
Seurat_objects$WT<- FindClusters(Seurat_objects$WT, resolution = i)
}
# clustree(Seurat_objects$WT, prefix = "SCT_snn_res.")
Idents(Seurat_objects$WT)<-Seurat_objects$WT[["SCT_snn_res.0.6"]]
# VlnPlot(Seurat_objects$WT,features = "nCount_RNA")
Seurat_objects$WT<-subset(Seurat_objects$WT,subset=nCount_RNA>580)
# VlnPlot(Seurat_objects$WT,features = "percent.mito")
###analyse KO
for(i in seq(1.0,2.0,0.1)){
Seurat_objects$KO<- FindClusters(Seurat_objects$KO, resolution = i)
}
# Seurat_objects$KO@meta.data<-Seurat_objects$KO[[]][,-grep("SCT_",colnames(Seurat_objects$KO[[]]),fixed = T)]
# clustree(Seurat_objects$KO, prefix = "SCT_snn_res.")
Idents(Seurat_objects$KO)<-Seurat_objects$KO[["SCT_snn_res.1.3"]]
# plot_grid(VlnPlot(Seurat_objects$KO,features = "nCount_RNA"),VlnPlot(Seurat_objects$KO,features = "percent.mito"))
Seurat_objects$KO<-subset(Seurat_objects$KO,subset=nCount_RNA>580)
dataset.markers[c("WT","KO")]<-llply(Seurat_objects[c("WT","KO")],FindAllMarkers, only.pos = TRUE,.parallel=T,.paropts = list(.packages="Seurat"))
for(i in c("WT","KO")){
top_genes[[i]]<- dataset.markers[[i]] %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 5,
wt = avg_log2FC)
}
plot_grid(plot_grid(DoHeatmap(Seurat_objects$KO,top_genes$KO$gene)+NoLegend(),
DimPlot(Seurat_objects$KO)+NoAxes()),plot_grid(DoHeatmap(Seurat_objects$WT,top_genes$WT$gene)+NoLegend(),
DimPlot(Seurat_objects$WT)+NoAxes()))So now that we now that our clusters make sense, we can use DoubletFinder.
Seurat_objects$WT<-doublets(Seurat_objects$WT,dim = 1:21)
Seurat_objects$KO<-doublets(Seurat_objects$KO,dim = 1:21)
###the meta col name for doublets might vary slightly, but starts always with "DF.classifications"
name_doublet_wt<-grep("^DF.classifications_",names(Seurat_objects$WT[[]]),value = T)
name_doublet_ko<-grep("^DF.classifications_",names(Seurat_objects$KO[[]]),value = T)
###visualise doublets
plot_grid(plot_grid(DimPlot(Seurat_objects$WT,group.by = name_doublet_wt)+labs(title=""),DimPlot(Seurat_objects$WT)),
plot_grid(DimPlot(Seurat_objects$KO,group.by = name_doublet_ko)+labs(title=""),DimPlot(Seurat_objects$KO)),labels = c("WT","KO"))
###lets remove the doublets
###since we do not know the exact column way, we need to use Fetch data as a workaround
expr <- FetchData(Seurat_objects$WT, vars = name_doublet_wt)
Seurat_objects$WT <- Seurat_objects$WT[, which(expr == "Singlet")]
expr <- FetchData(Seurat_objects$KO, vars = name_doublet_ko)
Seurat_objects$KO <- Seurat_objects$KO[, which(expr == "Singlet")]The number of doublets is low in this datasets, let’s remove the ones which were found.
Lets now merge again the corrected Genotypes together:
###The Genotypes were merged after ambient gene and Doublet removal
###Error: Error in UseMethod(generic = "DefaultAssay", object = object) :
# no applicable method for 'DefaultAssay' applied to an object of class "NULL"
#Thus genotypes are saved intermediately
WT<-Seurat_objects$WT
KO<-Seurat_objects$KO
Seurat_objects$processed<-merge(WT,KO)
###remove the intermediate variables again
rm(WT,KO)###WT should always appear at the left side
Seurat_objects$processed@meta.data$Genotype<-relevel(as.factor(Seurat_objects$processed@meta.data$Genotype),"WT")
###umap-learn needs to be installed... if error just remove umap="umap-learn"
Seurat_objects$processed<-seurat_flow(Seurat_objects$processed,res=0.5,dim = 1:21,umap="uwot",only.var = F)
plot_grid(DimPlot(Seurat_objects$processed,reduction="umap",group.by = "Genotype")+NoAxes()+NoLegend(),
DimPlot(Seurat_objects$processed,reduction="umap",group.by = "Mouse")+NoAxes()+NoLegend(),
DimPlot(Seurat_objects$processed,reduction="umap",group.by = "Pool")+NoAxes()+NoLegend())###The cells do not separate according to the genotypes ... thus batch correction or integration not required, Samples seems also to be equally distributed
names(Seurat_objects$processed[[]])
#> [1] "orig.ident" "nCount_RNA"
#> [3] "nFeature_RNA" "zeros"
#> [5] "Genotype" "Mouse"
#> [7] "Pool" "percent.mito"
#> [9] "nCount_SCT" "nFeature_SCT"
#> [11] "SCT_snn_res.0.4" "seurat_clusters"
#> [13] "SCT_snn_res.0.6" "SCT_snn_res.0.1"
#> [15] "SCT_snn_res.0.2" "SCT_snn_res.0.3"
#> [17] "SCT_snn_res.0.5" "SCT_snn_res.0.7"
#> [19] "SCT_snn_res.0.8" "SCT_snn_res.0.9"
#> [21] "SCT_snn_res.1" "pANN_0.25_0.03_33"
#> [23] "DF.classifications_0.25_0.03_33" "SCT_snn_res.1.1"
#> [25] "SCT_snn_res.1.2" "SCT_snn_res.1.3"
#> [27] "SCT_snn_res.1.4" "SCT_snn_res.1.5"
#> [29] "SCT_snn_res.1.6" "SCT_snn_res.1.7"
#> [31] "SCT_snn_res.1.8" "SCT_snn_res.1.9"
#> [33] "SCT_snn_res.2" "pANN_0.25_0.03_31"
#> [35] "DF.classifications_0.25_0.03_31"
#remove all metadata,which is not required anymore
Seurat_objects$processed@meta.data<-Seurat_objects$processed[[]][,-grep("SCT_",names(Seurat_objects$processed[[]]),fixed = T)]
# Seurat_objects$processed@meta.data<-Seurat_objects$processed[[]][,-grep("DF.classifications",names(Seurat_objects$processed[[]]),fixed = T)]
# Seurat_objects$processed@meta.data<-Seurat_objects$processed[[]][,-grep("pANN_",names(Seurat_objects$processed[[]]),fixed = T)]
###find optimal #cluster using clustree
for(i in seq(0.1,1.2,0.1)){
Seurat_objects$processed<- FindClusters(Seurat_objects$processed, resolution = i)
}
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.9440
#> Number of communities: 6
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.9167
#> Number of communities: 7
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8978
#> Number of communities: 10
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8817
#> Number of communities: 10
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8655
#> Number of communities: 10
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8513
#> Number of communities: 11
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8389
#> Number of communities: 13
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8273
#> Number of communities: 14
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8170
#> Number of communities: 15
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8077
#> Number of communities: 15
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7983
#> Number of communities: 16
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 5308
#> Number of edges: 217614
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7897
#> Number of communities: 16
#> Elapsed time: 0 seconds
clustree(Seurat_objects$processed, prefix = "SCT_snn_res.")Now that we have are completely QC corrected dataset, Let’s use the Nbclust tool to determine the optimal number of clusters. Nbclust performs a validation for the optimal number of cluster, using 33 Tools. The function optimal_n_cluster is a wrapper. It first executes NBclust with all available tools in parallel and then saves the results in class “nbclust”:
optimal_clusters<-optimal_n_cluster(Seurat_objects$processed,min.nc = 10,max.nc = 25)
#> Warning: Removed 2 row(s) containing missing values (geom_path).
#> Warning: Removed 2 rows containing missing values (geom_point).
optimal_clusters@top3
#> [1] "22" "10" "12"
DimPlot(Seurat_objects$processed)One could sometimes also find plots in optimal_clusters@plots, here it did not work. But the top 3 number of clusters from all the tools ,were 10,14,12 clusters. Together with the results of nbclust, 10 clusters seems to be optimal. Upon visual inspection of the UMAP, further subgroups in the clusters 3,4,5,1 and 9 were found, which we were not able to separate, when considering all cells. For these cells we can perform subclustering:
Lets first remove T cells from analysis:
###set best resolution
cluster_ids<-list()
Idents(Seurat_objects$processed)<-Seurat_objects$processed[["SCT_snn_res.0.5"]]
dataset.markers$processed<-FindAllMarkers(Seurat_objects$processed)
top_genes$processed<- dataset.markers$processed %>% group_by(cluster) %>% top_n(n = 5,
wt = avg_log2FC)
DoHeatmap(Seurat_objects$processed,top_genes$processed$gene,group.colors = my_cols2)+NoLegend()
###remove Cd8+ cells
Seurat_objects$processed<-subset(Seurat_objects$processed,idents=seq(0,9,1)[-8])
###need to rerun seurat flow after subsetting
Seurat_objects$processed<-seurat_flow(Seurat_objects$processed,res=0.5,dim = 1:21,umap="uwot",only.var = F)
####marker genes
### Visual inspection showed possible subclusters for cluster 3,4,5 ... thus performed subclustering
Seurat_objects$processed<-FindSubCluster(Seurat_objects$processed,cluster = "3",graph.name = "SCT_snn",res=0.25)
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 610
#> Number of edges: 19276
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7735
#> Number of communities: 3
#> Elapsed time: 0 seconds
Idents(Seurat_objects$processed)<-Seurat_objects$processed$sub.cluster
Seurat_objects$processed<-FindSubCluster(Seurat_objects$processed,cluster = "4",graph.name = "SCT_snn",resolution =0.15)
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 405
#> Number of edges: 14031
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8659
#> Number of communities: 2
#> Elapsed time: 0 seconds
Idents(Seurat_objects$processed)<-Seurat_objects$processed$sub.cluster
Seurat_objects$processed<-FindSubCluster(Seurat_objects$processed,cluster = "2",graph.name = "SCT_snn",resolution = 0.35)
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 701
#> Number of edges: 24099
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7425
#> Number of communities: 3
#> Elapsed time: 0 seconds
Idents(Seurat_objects$processed)<-Seurat_objects$processed$sub.cluster
DimPlot(Seurat_objects$processed,label=T)Now you can see, that all cluster capture one homogenous cell group in the UMAP (at least visually). Now we can identify the clusters:
dataset.markers$processed<-FindAllMarkers(Seurat_objects$processed)
#> Calculating cluster 1
#> Calculating cluster 8
#> Calculating cluster 3_0
#> Calculating cluster 2_1
#> Calculating cluster 2_0
#> Calculating cluster 0
#> Calculating cluster 5
#> Calculating cluster 2_2
#> Calculating cluster 4_0
#> Calculating cluster 3_1
#> Calculating cluster 7
#> Calculating cluster 9
#> Calculating cluster 3_2
#> Calculating cluster 10
#> Calculating cluster 4_1
#> Calculating cluster 6
top_genes$processed<- dataset.markers$processed %>% group_by(cluster) %>% top_n(n = 5,
wt = avg_log2FC)
####rename cluster ####
cluster_ids$processed <- c("ss_MG","early_NP","int_homeo_MG","homeo_DAM","inflam_DAM","int_MG","int_NP","type_1_MF","CAM","homeo_MG","inflam_MG","inflam_NP","SC","MC","IR_MF","IR_DAM")
###asign the clusters to be changed, as names to the new cluster identities
names(cluster_ids$processed) <- levels(Idents(Seurat_objects$processed))
### renamve the clusters based on this named vector
Seurat_objects$processed <- RenameIdents(Seurat_objects$processed, cluster_ids$processed)
#check if the cluster IDs were changed successfully
levels(Idents(Seurat_objects$processed))
#> [1] "ss_MG" "early_NP" "int_homeo_MG" "homeo_DAM" "inflam_DAM"
#> [6] "int_MG" "int_NP" "type_1_MF" "CAM" "homeo_MG"
#> [11] "inflam_MG" "inflam_NP" "SC" "MC" "IR_MF"
#> [16] "IR_DAM"
##don't forget to rename the color palette for the new cluster names
names(my_cols)<-levels(Idents(Seurat_objects$processed))
my_cols2 <- my_cols[order(as.integer(names(my_cols)))]
#> Warning in eval(quote(list(...)), env): NAs introduced by coercion
plot_grid(DoHeatmap(Seurat_objects$processed,top_genes$processed$gene,group.colors = my_cols2)+NoLegend(),DimPlot(Seurat_objects$processed,label = T,cols = my_cols2)+NoAxes()+NoLegend())Since we know the identities of our clusters, we can check the differences between the genotypes, lets first create a Barplot showing the cluster abundances:
####Cluster cell numbers####
###extract frequencies of each seurat clusters splitted into genotypes as table from Seurat object total
table_cluster<-table(Seurat_objects$processed@active.ident,Seurat_objects$processed$Genotype)
table_frame<- as.data.frame(table_cluster)
colnames(table_frame)<-c("Seurat_cluster","Genotype","Freq")
table_frame$Genotype<-relevel(table_frame$Genotype,"WT")
####required for the total cell number labels in the plot later
table_geno<-table(Seurat_objects$processed$Genotype)
table_geno<- as.data.frame(table_geno)
table_geno$Seurat_cluster<-"act. MG"
colnames(table_geno)<-c("Genotype","Freq","Seurat_cluster")
table_geno$Genotype<-relevel(table_geno$Genotype,"WT")
#####relative cell numbers
##compare Genotype
###total cell numbers
props_total_t<-table_geno%>%dplyr::group_by(Genotype)%>%dplyr::mutate(total=sum(Freq),prop=sum(Freq/sum(Freq)))
### cell percentage for each cluster
props_total_g<-table_frame%>%dplyr::group_by(Genotype)%>%dplyr::mutate(prop=Freq / sum(Freq))
props_total_g<-as.data.frame(props_total_g)
plot_list_g<-list()
plot_list_g<-sapply(colnames(props_total_g),gen_plots,data=props_total_g,list=plot_list_g,xvalue="Genotype",yvalue="prop")
# names(plot_list_g)<-colnames(props_total_g)
##we need only the plot for the clusters splitted into genotypes
plot_list_g<-plot_list_g[c(1)]
graph_list_g<-list()
###you need to have the same number colors for the clusters otherwise NA will be shown
my_cols2<-my_cols2[-c(17,18,19)]
graph_list_g<-lapply(plot_list_g,scRNAseq.analysis::draw_plot,ylab="rel. Abundance",xlab="",plot=geom_col(),cluster_colors = my_cols2)
###plot with percentages and absolute cell numbers across genotypes and cell clusters
cluster_abs<-graph_list_g$Seurat_cluster.col+geom_text(aes(label =paste(Freq,"cells",sep=" ")), position = position_stack(vjust = 0.5),size=1.5)+geom_text(data=props_total_t,aes(label =total,y=prop,x=Genotype), vjust = -.25,size=3)
#
# saveRDS(cluster_abs,here::here("figures","cluster_abs.RDS"))
plot_grid(DimPlot(Seurat_objects$processed,group.by = "Genotype")+theme(title = element_text(size=9)),DimPlot(Seurat_objects$processed,cols = my_cols2,label=T)+NoLegend(),cluster_abs)First we need to extract all genes in the dataset and translate them into EntrezID, this genes will than be used as background for significance testing in the GSEA function:
##check if non protein coding genes give additional information
# Seurat_objects$safe<-Seurat_objects$processed
# Seurat_objects$full<-subset(Seurat_objects$total,cells=colnames(Seurat_objects$processed))
# Seurat_objects$full@reductions$prot_only<- Seurat::CreateDimReducObject(Seurat_objects[["processed"]]@reductions[["umap"]]@cell.embeddings)
# Idents(Seurat_objects$full)<-Idents(Seurat_objects$processed)
# DimPlot(Seurat_objects$full,reduction = "prot_only")
# Seurat_objects$processed<-Seurat_objects$full
###no real plus, so kept the protein coding only
###need to execute after every restart, otherwise Error: external pointer is not valid
OrgDb = org.Mm.eg.db
###prepare background genes
###select all genes in dataset ... required for stastitical testing ... see "universe" in enricher()
#TODO: Try using only the genes found in the cluster as background, at least for comparing the genotypes
background <- as.matrix(GetAssayData(object = Seurat_objects$processed, slot = 'counts'))
background <- rownames(background[apply(background, 1, function (x) {sum(x >= 1)})>3,])
background <- bitr(background, fromType = "SYMBOL", toType="ENTREZID", OrgDb=OrgDb)
#> 'select()' returned 1:1 mapping between keys and columns
#> Warning in bitr(background, fromType = "SYMBOL", toType = "ENTREZID", OrgDb =
#> OrgDb): 0.37% of input gene IDs are fail to map...
### Get human homologues for mouse genes
background_hsa <- getLDS(attributes = c("entrezgene_id"),
filters = "entrezgene_id",
values = background$ENTREZID,
mart = mouse_ensembl,
attributesL = c("entrezgene_id"),
martL =human_ensembl,
uniqueRows=T)
colnames(background_hsa)<-c("entrez_m","entrez_h")
background_hsa$entrez_m<-as.character(background_hsa$entrez_m)
background_hsa$entrez_h<-as.character(background_hsa$entrez_h)
background<-left_join(background,background_hsa,by=c("ENTREZID"="entrez_m"))
##background_hsa was just a helper variable, so lets remove it
rm(background_hsa)In single cell experiments, there are multiple possibilites, which genes to take for GSEA. First, the marker genes of the single cell clusters can be used to confirm our annotation. Seurat::FindMarkers() will be used on the condition variable, which is in our case the cell clusters, total is set to TRUE, so that the Idents are only considered once.
There are 5 available databases “GO”,“KEGG”,“DO”,“Hallmark”,“cannonicalPathways”,“Motifs”,“ImmunoSignatures”. Lets take all of them and perform GSEA in parallel using foreach :
###perform scRNAseq.analysis::GSEA for all available databases, the results will be saved in list called ontology
###scRNAseq.analysis::GSEA no differential expression between genotypes
###perform GSEA for all available databases, the results will be saved in list called ontology
ontology<-list()
ontology_db<-c("GO","KEGG","DO","Hallmark","cannonicalPathways","Motifs","ImmunoSignatures")
Seurat_objects$processed$seurat_clusters<-Idents(Seurat_objects$processed)
ontology[["nogeno"]]<-NULL
ontology[["nogeno"]]<-foreach(i=ontology_db,.errorhandling = "remove",.combine=c,.multicombine = T) %dopar% {
R.utils::withTimeout({scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "seurat_clusters",top = 50,GeneSets = i,total = T,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)},
timeout = 180,cpu=F,onTimeout = "silent")
}
###Here Seurat_objects$processed is my final seurat object
###sometimes GO doesn't work in parallel, just run scRNAseq.analysis::GSEA in one core mode and it should be fine
ontology$nogeno["GO"]<-scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "seurat_clusters",top = 50,GeneSets = "GO",total = T,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)
#> Calculating cluster ss_MG
#> Calculating cluster early_NP
#> Calculating cluster int_homeo_MG
#> Calculating cluster homeo_DAM
#> Calculating cluster inflam_DAM
#> Calculating cluster int_MG
#> Calculating cluster int_NP
#> Calculating cluster type_1_MF
#> Calculating cluster CAM
#> Calculating cluster homeo_MG
#> Calculating cluster inflam_MG
#> Calculating cluster inflam_NP
#> Calculating cluster SC
#> Calculating cluster MC
#> Calculating cluster IR_MF
#> Calculating cluster IR_DAM
#> 'select()' returned 1:1 mapping between keys and columns
#> Warning in clusterProfiler::bitr(top$gene, fromType = "SYMBOL", toType =
#> "ENTREZID", : 1.62% of input gene IDs are fail to map...Furthermore, DE genes between Genotypes can be used for GSEA. Here our condition is the Genotype meta data variable, and total=T needs to be set, so that the single cell clusters are not considered in the analysis:
### make scRNAseq.analysis::GSEA for the total cells (all clusters together) comparing WT and KO, if nothing found will be ignored (.errorhandling=remove) ... if you want to see the errormessage set .errorhandling="pass"
ontology[["total"]]<-NULL
ontology[["total"]]<-foreach(i=ontology_db,.errorhandling="remove",.combine=c,.multicombine = T) %dopar% {
R.utils::withTimeout({scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "Genotype",top = 50,GeneSets = i,total = T,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)},
timeout = 180,cpu=F,onTimeout = "silent")
}Finally, the DE Genes between Genotypes in each clustered can be used for GSEA, by setting total=F:
###Then scRNAseq.analysis::GSEA for each cluster individually
ontology[["cluster"]]<-NULL
ontology[["cluster"]]<-foreach(i=ontology_db,.errorhandling = "remove",.combine=c,.multicombine = T) %dopar% {
R.utils::withTimeout({scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "Genotype",top = 50,GeneSets = i,total = F,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)},
timeout = 180,cpu=F,onTimeout = "silent")
}
###Here Seurat_objects$processed is my final seurat object
###sometimes GO doesn't work in parallel, just run scRNAseq.analysis::GSEA in one core mode and it should be fine
ontology$cluster["GO"]<-scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "Genotype",top = 50,GeneSets = "GO",total = F,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)
#> [1] "ss_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "early_NP"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "int_homeo_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "homeo_DAM"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "inflam_DAM"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "int_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "int_NP"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "type_1_MF"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "CAM"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "homeo_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "inflam_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "inflam_NP"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "SC"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "MC"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "IR_MF"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "IR_DAM"
#> Calculating cluster WT
#> Calculating cluster KO
#> 'select()' returned 1:1 mapping between keys and columns
#> Warning in clusterProfiler::bitr(top$gene, fromType = "SYMBOL", toType =
#> "ENTREZID", : 1.24% of input gene IDs are fail to map...Lets only put WT on the left side of the plot:
###relevel the genotype, so that WT appears on left side
for(i in ontology_db){
ontology[["total"]][[i]]@compareClusterResult[["cluster"]]<-as.factor(ontology[["total"]][[i]]@compareClusterResult[["cluster"]])
ontology[["total"]][[i]]@compareClusterResult[["cluster"]]<-relevel(ontology[["total"]][[i]]@compareClusterResult[["cluster"]],"WT")
}
for(i in ontology_db){
ontology[["cluster"]][[i]]@compareClusterResult[["Condition"]]<-as.factor(ontology[["cluster"]][[i]]@compareClusterResult[["Condition"]])
ontology[["cluster"]][[i]]@compareClusterResult[["Condition"]]<-relevel(ontology[["cluster"]][[i]]@compareClusterResult[["Condition"]],"WT")
}We created now only GSEA objects. To visualise the results dotplot can be used, here we can create a list of ggplots, so that we don’t need to run ggplot for each database:
### Create dotplots
###create dotplots for all the ontologies created above
ontology_plots<-list()
ontology_plots[["total"]]<-foreach(i=names(ontology$total),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
ontology_plots[[i]]<-clusterProfiler::dotplot(ontology$total[[i]],x=~cluster,show=5)+theme(axis.text.x = element_text(angle=60,vjust = 1, hjust=1,size=3),axis.text.y = element_text(size=3),title=element_text(size=5),legend.key.size = unit(0.2,"line"),legend.text = element_text(size=3))+ggtitle(i)+scale_size(range = c(0.1,1))
return(ontology_plots)
}
###create dot plots in separate plots for WT and KO
plots<-list()
ontology_plots[["cluster"]]<-NULL
ontology_plots[["cluster"]]<-foreach(i=names(ontology$cluster),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
plots[[i]]<-dotplot(ontology$cluster[[i]],x=~cluster,show=5)+facet_wrap(~Condition)+theme(axis.text.x = element_text(angle=60,vjust = 1, hjust=1,size=3),strip.text = element_text(size=4,margin = margin(0.5,0,0.5,0, "mm")),axis.text.y = element_text(size=3),title=element_text(size=5),legend.key.size = unit(0.2,"line"),legend.text = element_text(size=3))+ggtitle(i)+scale_size(range = c(0.1,1))
return(plots)
}
####create dotplot without comparing genotypes for each cluster
plots<-list()
ontology_plots[["nogeno"]]<-NULL
ontology_plots[["nogeno"]]<-foreach(i=names(ontology$nogeno),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
plots[[i]]<-dotplot(ontology$nogeno[[i]],x=~cluster,show=5)+theme(axis.text.x = element_text(angle=60,vjust = 1, hjust=1,size=3),axis.text.y = element_text(size=3),title=element_text(size=5),legend.key.size = unit(0.2,"line"),legend.text = element_text(size=3))+ggtitle(i)+scale_size(range = c(0.1,1))
return(plots)
}
# saveRDS(ontology_plots,here::here("RImages/SeqWell_myeolid/Exp_191114","ontology_plots.RDS"))Using plot_grid, multiple results dotplots can be visualised at once. For each type of marker genes we used, an entry in the ontology_plots list was created. The next level in the list are the single databases, they can be accessed by name or position:
###plot all in one: probably too much, thus need to subset: here only plot 7 will be plotted
levels(Idents(Seurat_objects$processed))
#> [1] "ss_MG" "early_NP" "int_homeo_MG" "homeo_DAM" "inflam_DAM"
#> [6] "int_MG" "int_NP" "type_1_MF" "CAM" "homeo_MG"
#> [11] "inflam_MG" "inflam_NP" "SC" "MC" "IR_MF"
#> [16] "IR_DAM"
plot_grid(plotlist = ontology_plots$nogeno["GO"])plot_grid(plotlist = ontology_plots$total[c("GO","DO","KEGG","Motifs")])plot_grid(plotlist = ontology_plots$cluster[c(1:3)])In the same way, a network visualisation can be used:
###for visualisation as a network package enrichplot required
library(enrichplot)
###same as above but visualisation as emaplot(network)
emaplots<-list()
emaplots[["total"]]<-foreach(i=names(ontology$total),.packages = c("clusterProfiler"),.combine=c,.multicombine=T,.errorhandling = "remove")%dopar%{
emaplots[[i]]<-emapplot(pairwise_termsim(ontology$total[[i]]),showCategory = 15)
return(emaplots)
}
emaplots[["cluster"]]<-NULL
plots<-list()
emaplots[["cluster"]]<-foreach(i=names(ontology$cluster),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
plots[[i]]<-emapplot(pairwise_termsim(ontology$cluster[[i]]),showCategory = 15)
return(plots)
}
plots<-list()
emaplots[["nogeno"]]<-NULL
emaplots[["nogeno"]]<-foreach(i=names(ontology$nogeno),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
plots[[i]]<-emapplot(pairwise_termsim(ontology$nogeno[[i]]),showCategory = 15)
return(plots)
}
plot_grid(plotlist = emaplots$nogeno["GO"])####Volcano Plots ####
tmp<-Seurat_objects$processed
dataset.markers$cluster<-list()
for (i in levels(Idents(tmp))){
print(i)
tmp2<-subset(tmp,idents=i)
Idents(object = tmp2) <- tmp2[[]][,"Genotype"]
dataset.markers$cluster[[i]] <-FindMarkers(object = tmp2,ident.1 = "KO",ident.2 = "WT",logfc.threshold = 0.1)
# colnames(dataset.markers$cluster[[i]])[6]<-"Condition"
dataset.markers$cluster[[i]]$cluster<-as.factor(i)
}
#> [1] "ss_MG"
#> [1] "early_NP"
#> [1] "int_homeo_MG"
#> [1] "homeo_DAM"
#> [1] "inflam_DAM"
#> [1] "int_MG"
#> [1] "int_NP"
#> [1] "type_1_MF"
#> [1] "CAM"
#> [1] "homeo_MG"
#> [1] "inflam_MG"
#> [1] "inflam_NP"
#> [1] "SC"
#> [1] "MC"
#> [1] "IR_MF"
#> [1] "IR_DAM"
Idents(tmp)<-tmp$Genotype
dataset.markers$geno_total<-FindMarkers(tmp,ident.1 = "KO",ident.2 = "WT",logfc.threshold = 0.1)
rm(tmp,tmp2)
highlight<-c("Camk1d","Cmss1","Tmem119","Apc","Ctsb","Tmsb4x","S100a9","Fkbp5","Apoe","Anapc15","Ctsb","Junb","Psap","Clec7a","Lgals3bp","Tmem9","Cdk17")
# dataset.markers$cluster<-do.call(rbind,dataset.markers$cluster)
# dataset.markers$cluster %>% group_by(cluster,Condition) %>% top_n(n = 50, wt = avg_log2FC) -> top
Volcano_plots<-list()
###create list of volcano plots
Volcano_plots$total<-Volcano_plot(dataset.markers$geno_total,FC_cutoff=0.25,p_cutoff=1e-30,legend="none",titles = "Total",lab.size = 2,title.size = 6)+xlab(NULL)+ylab(NULL)+theme(axis.ticks = element_line(size=.3),axis.line = element_line(size=.3))
#> Registered S3 methods overwritten by 'ggalt':
#> method from
#> grid.draw.absoluteGrob ggplot2
#> grobHeight.absoluteGrob ggplot2
#> grobWidth.absoluteGrob ggplot2
#> grobX.absoluteGrob ggplot2
#> grobY.absoluteGrob ggplot2
for(i in names(dataset.markers$cluster)){
Volcano_plots[[i]]<-Volcano_plot(dataset.markers$cluster[[i]],do.col=F,Condition="cluster",FC_cutoff=0.25,p_cutoff=5e-02,legend="none",lab.size=2,title.size=6)+xlab(NULL)+ylab(NULL)+theme(axis.ticks = element_line(size=.3),axis.line = element_line(size=.3))
}
Volcano_plots<-Volcano_plots[!(names(Volcano_plots)%in%c("inflam_DAM","inflam_NP","MC","IR_MF","int_homeo_MG","homeo_DAM","type_1_MF"))]
#add global x and y axes
y.grob <- textGrob("-Log10(p)",
gp=gpar(fontface="bold", col="black", fontsize=10), rot=90)
x.grob <- textGrob("Log2(FC)",
gp=gpar(fontface="bold", col="black", fontsize=10))
allVolcano<-plot_grid(plotlist = Volcano_plots)+geom_segment(aes(x = 0.01, y = -0.01, xend = 0.01, yend = 0.95),
arrow = arrow(length = unit(0.2, "cm")))+geom_segment(aes(x = -0.01, y = 0, xend = 0.95, yend = 0),
arrow = arrow(length = unit(0.2, "cm")))+theme(plot.margin=unit(c(7,7,7,7),"mm"))
grid.arrange(arrangeGrob(allVolcano,
left = y.grob, bottom = x.grob))
processed<-Seurat_objects$processed
# saveRDS(Volcano_plots,here::here("RImages/SeqWell_myeolid/Exp_191114","Volcano_plots.RDS"))